# -*- coding: utf-8 -*-
"""PaperSimilInfoTech.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1wYp_EwYSWoDzay0g7Paw2udMyYQ_CoKN
"""

# Calculating similarity in different ways.
# Similarity distances by the number of vertices in the taxonomy tree:
# https://habr.com/ru/articles/778048/
# v.17.04.2024

# CBOW stands for Continuous Bag of Words, one of the two word2vec training objectives.

from gensim import models

# Download and unpack the pretrained Google News word2vec vectors (300 dimensions)
!wget -c "https://rzn-obr.ru/GoogleNews-vectors-negative300.bin.gz"
!gzip -d GoogleNews-vectors-negative300.bin.gz

w2v = models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Inspect the vector and nearest neighbours of individual words
vect = w2v['Shooting']
w2v.most_similar('Shooting')

vect = w2v['attack']
w2v.most_similar('attack')

word1 = 'attack'
word2 = 'Shooting'
similarity = w2v.similarity(word1, word2)
print(f'Semantic similarity between "{word1}" and "{word2}" is {similarity}')

word1 = 'attack'
word2 = 'bombing'
similarity = w2v.similarity(word1, word2)
print(f'Semantic similarity between "{word1}" and "{word2}" is {similarity}')

# A function for calculating the average word vector of a phrase
import numpy as np

def average_vector(model, phrase):
    # Keep only the words that are present in the model vocabulary
    words = [word for word in phrase.split() if word in model.key_to_index]
    if not words:
        return None
    vectors = [model[word] for word in words]
    avg_vector = np.mean(vectors, axis=0)
    return avg_vector

# Calculating the semantic similarity between two phrases
phrase1 = 'attack attack'
phrase2 = 'venue leaves'

avg_vector1 = average_vector(w2v, phrase1)
avg_vector2 = average_vector(w2v, phrase2)

if avg_vector1 is not None and avg_vector2 is not None:
    # Cosine similarity between the two averaged phrase vectors
    similarity = np.dot(avg_vector1, avg_vector2) / (np.linalg.norm(avg_vector1) * np.linalg.norm(avg_vector2))
    print(f'Semantic similarity between "{phrase1}" and "{phrase2}" is {similarity}')
else:
    print('One of the phrases contains only words that are missing from the model vocabulary.')

# On a certain date, there was an attack on a city facility where many people became victims.
# March 23, 2024: shooting at a Moscow concert venue leaves over 130 dead.

# Imports for the transformer-based and bag-of-words sentence comparisons below
from transformers import BertTokenizer, BertModel
import torch
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Calculating semantic similarity of sentences. The trailing comments appear to
# record a source tag and two previously computed similarity scores against sentence1.
sentence1 = "On a certain date, there was an attack on a city facility where many people became victims."  # cnn 1 1
sentence2 = "On March 23, 2024, terrorists attacked Moscow, killing more than 130 citizens."  # IYu 0.904102623462677 0.812596321105957
sentence3 = "March 23 2024 terrorists strike at a packed concert hall in the Russian capital, leaving at least 60 dead."  # RT 0.9349522590637207 0.8757287859916687
sentence4 = "On the morning of 11.09.2001, two Boston planes destroyed the World Trade Center in New York."  # NYTimes 0.8471251726150513 0.7009884119033813
sentence5 = "On October 11, 2022, a new multifunctional medical center was opened in Lugansk."  # TASS 0.8411691188812256 0.6638354063034058
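# The BertTokenizer/BertModel/torch/cosine imports above are not exercised
# anywhere in this notebook, yet the paired scores in the sentence comments
# suggest a BERT-based comparison was run at some point. The sketch below is
# one plausible reconstruction, not the original cell: the model name
# 'bert-base-uncased' and the mean-pooling strategy are assumptions.

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')
bert.eval()

def bert_sentence_vector(sentence):
    # Tokenize and run a forward pass without gradient tracking
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True)
    with torch.no_grad():
        outputs = bert(**inputs)
    # Mean-pool the last hidden state over the token dimension
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Compare sentence1 against each of the other example sentences;
# scipy's cosine() is a distance, so similarity is 1 - distance
for other in (sentence2, sentence3, sentence4, sentence5):
    sim = 1 - cosine(bert_sentence_vector(sentence1), bert_sentence_vector(other))
    print(f'BERT similarity with "{other[:40]}...": {sim:.4f}')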
# Cosine similarity over simple bag-of-words counts
sentences = ['On a certain date, there was an attack on a city facility where many people became victims.',
             'On March 23, 2024, terrorists attacked Moscow, killing more than 130 citizens.',
             'On October 11, 2022, a new multifunctional medical center was opened in Vladimir.']

vectorizer = CountVectorizer()
sentence_vectors = vectorizer.fit_transform(sentences)

similarity_matrix = cosine_similarity(sentence_vectors, sentence_vectors)
print(similarity_matrix)

# Jaccard
def jaccard_similarity(x, y):
    """Returns the Jaccard similarity between two lists."""
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality / float(union_cardinality)

sentences = [
    #'On a certain date, there was an attack on a city facility where many people became victims.',
    'On March 23, 2024, terrorists attacked Moscow, killing more than 130 citizens.',
    'On October 11, 2022, a new multifunctional medical center was opened in Lugansk.']

# Tokenize to lowercase word lists; without this step the function would
# compare character sets rather than word sets.
sentences = [sent.lower().split(" ") for sent in sentences]

jaccard_similarity(sentences[0], sentences[1])

from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')  # required by word_tokenize in newer NLTK releases
nltk.download('wordnet')

# A function for getting synonyms of each word in a sentence
def get_synonyms(sentence_tokens):
    synonyms = []
    for word in sentence_tokens:
        for syn in wn.synsets(word):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
    return set(synonyms)

# A function for calculating the semantic similarity of two sentences
def sentence_similarity(sentence1, sentence2):
    tokens1 = word_tokenize(sentence1)
    tokens2 = word_tokenize(sentence2)

    synonyms1 = get_synonyms(tokens1)
    synonyms2 = get_synonyms(tokens2)

    score = 0.0
    count = 0

    # For every synonym from the first sentence, find the best Wu-Palmer
    # similarity against the synonyms of the second sentence
    for word1 in synonyms1:
        best_score = 0.0
        for word2 in synonyms2:
            sim = wn.synsets(word1)[0].wup_similarity(wn.synsets(word2)[0])
            if sim is not None and sim > best_score:
                best_score = sim
        score += best_score
        count += 1

    if count == 0:
        return 0.0
    return score / count

# Semantic similarity of the two example sentences defined above
similarity_score = sentence_similarity(sentence1, sentence2)
print(f'Semantic similarity of the two example sentences: {similarity_score}')

#!pip install nltk
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')

# Despite the name, this computes the maximum Wu-Palmer similarity over all
# sense pairs of the two words, not the Lesk gloss-overlap measure.
def lesk_similarity(word1, word2):
    synset1 = wordnet.synsets(word1)
    synset2 = wordnet.synsets(word2)

    max_similarity = -1
    for sense1 in synset1:
        for sense2 in synset2:
            similarity = sense1.wup_similarity(sense2)
            if similarity is not None and similarity > max_similarity:
                max_similarity = similarity

    return max_similarity

word1 = 'awareness'
word2 = 'informing'
similarity = lesk_similarity(word1, word2)
print(f"The similarity between '{word1}' and '{word2}' is: {similarity}")
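# The article linked at the top measures similarity by the number of vertices
# on the path between concepts in the WordNet taxonomy tree. A minimal sketch
# of that idea, using the standard NLTK measures: path_similarity is
# 1 / (shortest_path_length + 1) over hypernym edges, the direct path-count
# analogue of the wup_similarity used above. The helper name and the choice of
# first noun senses are illustrative, not from the original notebook.

def taxonomy_similarities(word1, word2):
    # Compare the first noun senses of both words with two path-based measures
    s1 = wordnet.synsets(word1, pos=wordnet.NOUN)
    s2 = wordnet.synsets(word2, pos=wordnet.NOUN)
    if not s1 or not s2:
        return None
    return {
        'path': s1[0].path_similarity(s2[0]),
        'wup': s1[0].wup_similarity(s2[0]),
    }

print(taxonomy_similarities('attack', 'bombing'))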